In [1]:
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
In [2]:
# Import CSV file
import warnings
warnings.filterwarnings ("ignore", category = UserWarning)
# Import the Pandas package.
import pandas as pd
# Create a variable named data_file_name to hold the name of the data file.
data_file_name = "spotify_songs.csv"
# Load the data file into a Pandas DataFrame.
df = pd.read_csv(data_file_name)
# Verify the DataFrame contents by printing the first 5 rows of data.
df.head(5)
Out[2]:
| track_id | track_name | track_artist | track_popularity | track_album_id | track_album_name | track_album_release_date | playlist_name | playlist_id | playlist_genre | ... | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6f807x0ima9a1j3VPbc7VN | I Don't Care (with Justin Bieber) - Loud Luxur... | Ed Sheeran | 66 | 2oCs0DGTsRO98Gh5ZSl2Cx | I Don't Care (with Justin Bieber) [Loud Luxury... | 2019-06-14 | Pop Remix | 37i9dQZF1DXcZDD7cfEKhW | pop | ... | 6 | -2.634 | 1 | 0.0583 | 0.1020 | 0.000000 | 0.0653 | 0.518 | 122.036 | 194754 |
| 1 | 0r7CVbZTWZgbTCYdfa2P31 | Memories - Dillon Francis Remix | Maroon 5 | 67 | 63rPSO264uRjW1X5E6cWv6 | Memories (Dillon Francis Remix) | 2019-12-13 | Pop Remix | 37i9dQZF1DXcZDD7cfEKhW | pop | ... | 11 | -4.969 | 1 | 0.0373 | 0.0724 | 0.004210 | 0.3570 | 0.693 | 99.972 | 162600 |
| 2 | 1z1Hg7Vb0AhHDiEmnDE79l | All the Time - Don Diablo Remix | Zara Larsson | 70 | 1HoSmj2eLcsrR0vE9gThr4 | All the Time (Don Diablo Remix) | 2019-07-05 | Pop Remix | 37i9dQZF1DXcZDD7cfEKhW | pop | ... | 1 | -3.432 | 0 | 0.0742 | 0.0794 | 0.000023 | 0.1100 | 0.613 | 124.008 | 176616 |
| 3 | 75FpbthrwQmzHlBJLuGdC7 | Call You Mine - Keanu Silva Remix | The Chainsmokers | 60 | 1nqYsOef1yKKuGOVchbsk6 | Call You Mine - The Remixes | 2019-07-19 | Pop Remix | 37i9dQZF1DXcZDD7cfEKhW | pop | ... | 7 | -3.778 | 1 | 0.1020 | 0.0287 | 0.000009 | 0.2040 | 0.277 | 121.956 | 169093 |
| 4 | 1e8PAfcKUYoKkxPhrHqw4x | Someone You Loved - Future Humans Remix | Lewis Capaldi | 69 | 7m7vv9wlQ4i0LFuJiE2zsQ | Someone You Loved (Future Humans Remix) | 2019-03-05 | Pop Remix | 37i9dQZF1DXcZDD7cfEKhW | pop | ... | 1 | -4.672 | 1 | 0.0359 | 0.0803 | 0.000000 | 0.0833 | 0.725 | 123.976 | 189052 |
5 rows × 23 columns
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 32833 entries, 0 to 32832 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 track_id 32833 non-null object 1 track_name 32828 non-null object 2 track_artist 32828 non-null object 3 track_popularity 32833 non-null int64 4 track_album_id 32833 non-null object 5 track_album_name 32828 non-null object 6 track_album_release_date 32833 non-null object 7 playlist_name 32833 non-null object 8 playlist_id 32833 non-null object 9 playlist_genre 32833 non-null object 10 playlist_subgenre 32833 non-null object 11 danceability 32833 non-null float64 12 energy 32833 non-null float64 13 key 32833 non-null int64 14 loudness 32833 non-null float64 15 mode 32833 non-null int64 16 speechiness 32833 non-null float64 17 acousticness 32833 non-null float64 18 instrumentalness 32833 non-null float64 19 liveness 32833 non-null float64 20 valence 32833 non-null float64 21 tempo 32833 non-null float64 22 duration_ms 32833 non-null int64 dtypes: float64(9), int64(4), object(10) memory usage: 5.8+ MB
In [4]:
# Shape of the dataset
shape = df.shape
print(shape)
unique_rows = df["track_id"].unique().shape[0]
print(unique_rows)
(32833, 23) 28356
In [5]:
# Number of null values in each column
df.isnull().sum()
Out[5]:
track_id 0 track_name 5 track_artist 5 track_popularity 0 track_album_id 0 track_album_name 5 track_album_release_date 0 playlist_name 0 playlist_id 0 playlist_genre 0 playlist_subgenre 0 danceability 0 energy 0 key 0 loudness 0 mode 0 speechiness 0 acousticness 0 instrumentalness 0 liveness 0 valence 0 tempo 0 duration_ms 0 dtype: int64
In [6]:
# Save Column names
col_names = df.columns
In [7]:
# Discretize Popularity of each song
# create blank column of zeroes
col = np.zeros((shape[0],1))
df_popularity_category = pd.DataFrame(col)
print(type(df_popularity_category))
# concatenate the dataframes
df = pd.concat([df,df_popularity_category], axis = 1)
df.head(5)
<class 'pandas.core.frame.DataFrame'>
Out[7]:
| track_id | track_name | track_artist | track_popularity | track_album_id | track_album_name | track_album_release_date | playlist_name | playlist_id | playlist_genre | ... | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | 0 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6f807x0ima9a1j3VPbc7VN | I Don't Care (with Justin Bieber) - Loud Luxur... | Ed Sheeran | 66 | 2oCs0DGTsRO98Gh5ZSl2Cx | I Don't Care (with Justin Bieber) [Loud Luxury... | 2019-06-14 | Pop Remix | 37i9dQZF1DXcZDD7cfEKhW | pop | ... | -2.634 | 1 | 0.0583 | 0.1020 | 0.000000 | 0.0653 | 0.518 | 122.036 | 194754 | 0.0 |
| 1 | 0r7CVbZTWZgbTCYdfa2P31 | Memories - Dillon Francis Remix | Maroon 5 | 67 | 63rPSO264uRjW1X5E6cWv6 | Memories (Dillon Francis Remix) | 2019-12-13 | Pop Remix | 37i9dQZF1DXcZDD7cfEKhW | pop | ... | -4.969 | 1 | 0.0373 | 0.0724 | 0.004210 | 0.3570 | 0.693 | 99.972 | 162600 | 0.0 |
| 2 | 1z1Hg7Vb0AhHDiEmnDE79l | All the Time - Don Diablo Remix | Zara Larsson | 70 | 1HoSmj2eLcsrR0vE9gThr4 | All the Time (Don Diablo Remix) | 2019-07-05 | Pop Remix | 37i9dQZF1DXcZDD7cfEKhW | pop | ... | -3.432 | 0 | 0.0742 | 0.0794 | 0.000023 | 0.1100 | 0.613 | 124.008 | 176616 | 0.0 |
| 3 | 75FpbthrwQmzHlBJLuGdC7 | Call You Mine - Keanu Silva Remix | The Chainsmokers | 60 | 1nqYsOef1yKKuGOVchbsk6 | Call You Mine - The Remixes | 2019-07-19 | Pop Remix | 37i9dQZF1DXcZDD7cfEKhW | pop | ... | -3.778 | 1 | 0.1020 | 0.0287 | 0.000009 | 0.2040 | 0.277 | 121.956 | 169093 | 0.0 |
| 4 | 1e8PAfcKUYoKkxPhrHqw4x | Someone You Loved - Future Humans Remix | Lewis Capaldi | 69 | 7m7vv9wlQ4i0LFuJiE2zsQ | Someone You Loved (Future Humans Remix) | 2019-03-05 | Pop Remix | 37i9dQZF1DXcZDD7cfEKhW | pop | ... | -4.672 | 1 | 0.0359 | 0.0803 | 0.000000 | 0.0833 | 0.725 | 123.976 | 189052 | 0.0 |
5 rows × 24 columns
In [8]:
median_popularity = df["track_popularity"].median()
df["popularity_category"] = (df["track_popularity"] > median_popularity).astype(int)
df.head()
Out[8]:
| track_id | track_name | track_artist | track_popularity | track_album_id | track_album_name | track_album_release_date | playlist_name | playlist_id | playlist_genre | ... | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | 0 | popularity_category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6f807x0ima9a1j3VPbc7VN | I Don't Care (with Justin Bieber) - Loud Luxur... | Ed Sheeran | 66 | 2oCs0DGTsRO98Gh5ZSl2Cx | I Don't Care (with Justin Bieber) [Loud Luxury... | 2019-06-14 | Pop Remix | 37i9dQZF1DXcZDD7cfEKhW | pop | ... | 1 | 0.0583 | 0.1020 | 0.000000 | 0.0653 | 0.518 | 122.036 | 194754 | 0.0 | 1 |
| 1 | 0r7CVbZTWZgbTCYdfa2P31 | Memories - Dillon Francis Remix | Maroon 5 | 67 | 63rPSO264uRjW1X5E6cWv6 | Memories (Dillon Francis Remix) | 2019-12-13 | Pop Remix | 37i9dQZF1DXcZDD7cfEKhW | pop | ... | 1 | 0.0373 | 0.0724 | 0.004210 | 0.3570 | 0.693 | 99.972 | 162600 | 0.0 | 1 |
| 2 | 1z1Hg7Vb0AhHDiEmnDE79l | All the Time - Don Diablo Remix | Zara Larsson | 70 | 1HoSmj2eLcsrR0vE9gThr4 | All the Time (Don Diablo Remix) | 2019-07-05 | Pop Remix | 37i9dQZF1DXcZDD7cfEKhW | pop | ... | 0 | 0.0742 | 0.0794 | 0.000023 | 0.1100 | 0.613 | 124.008 | 176616 | 0.0 | 1 |
| 3 | 75FpbthrwQmzHlBJLuGdC7 | Call You Mine - Keanu Silva Remix | The Chainsmokers | 60 | 1nqYsOef1yKKuGOVchbsk6 | Call You Mine - The Remixes | 2019-07-19 | Pop Remix | 37i9dQZF1DXcZDD7cfEKhW | pop | ... | 1 | 0.1020 | 0.0287 | 0.000009 | 0.2040 | 0.277 | 121.956 | 169093 | 0.0 | 1 |
| 4 | 1e8PAfcKUYoKkxPhrHqw4x | Someone You Loved - Future Humans Remix | Lewis Capaldi | 69 | 7m7vv9wlQ4i0LFuJiE2zsQ | Someone You Loved (Future Humans Remix) | 2019-03-05 | Pop Remix | 37i9dQZF1DXcZDD7cfEKhW | pop | ... | 1 | 0.0359 | 0.0803 | 0.000000 | 0.0833 | 0.725 | 123.976 | 189052 | 0.0 | 1 |
5 rows × 25 columns
In [9]:
# Start Data Preprocessing
# Find features to remove:
# Don't need playlist data: playlist_name, playlist_id, playlist_genre, playlist_subgenre
featuresToRemove = []
featuresToRemove.append("playlist_name")
featuresToRemove.append("playlist_id")
for featureToRemove in featuresToRemove:
del df[featureToRemove]
In [10]:
df.head(5)
Out[10]:
| track_id | track_name | track_artist | track_popularity | track_album_id | track_album_name | track_album_release_date | playlist_genre | playlist_subgenre | danceability | ... | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | 0 | popularity_category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6f807x0ima9a1j3VPbc7VN | I Don't Care (with Justin Bieber) - Loud Luxur... | Ed Sheeran | 66 | 2oCs0DGTsRO98Gh5ZSl2Cx | I Don't Care (with Justin Bieber) [Loud Luxury... | 2019-06-14 | pop | dance pop | 0.748 | ... | 1 | 0.0583 | 0.1020 | 0.000000 | 0.0653 | 0.518 | 122.036 | 194754 | 0.0 | 1 |
| 1 | 0r7CVbZTWZgbTCYdfa2P31 | Memories - Dillon Francis Remix | Maroon 5 | 67 | 63rPSO264uRjW1X5E6cWv6 | Memories (Dillon Francis Remix) | 2019-12-13 | pop | dance pop | 0.726 | ... | 1 | 0.0373 | 0.0724 | 0.004210 | 0.3570 | 0.693 | 99.972 | 162600 | 0.0 | 1 |
| 2 | 1z1Hg7Vb0AhHDiEmnDE79l | All the Time - Don Diablo Remix | Zara Larsson | 70 | 1HoSmj2eLcsrR0vE9gThr4 | All the Time (Don Diablo Remix) | 2019-07-05 | pop | dance pop | 0.675 | ... | 0 | 0.0742 | 0.0794 | 0.000023 | 0.1100 | 0.613 | 124.008 | 176616 | 0.0 | 1 |
| 3 | 75FpbthrwQmzHlBJLuGdC7 | Call You Mine - Keanu Silva Remix | The Chainsmokers | 60 | 1nqYsOef1yKKuGOVchbsk6 | Call You Mine - The Remixes | 2019-07-19 | pop | dance pop | 0.718 | ... | 1 | 0.1020 | 0.0287 | 0.000009 | 0.2040 | 0.277 | 121.956 | 169093 | 0.0 | 1 |
| 4 | 1e8PAfcKUYoKkxPhrHqw4x | Someone You Loved - Future Humans Remix | Lewis Capaldi | 69 | 7m7vv9wlQ4i0LFuJiE2zsQ | Someone You Loved (Future Humans Remix) | 2019-03-05 | pop | dance pop | 0.650 | ... | 1 | 0.0359 | 0.0803 | 0.000000 | 0.0833 | 0.725 | 123.976 | 189052 | 0.0 | 1 |
5 rows × 23 columns
In [11]:
# find the number of duplicate track IDs
ids = df["track_id"]
id_counts = ids.value_counts()
duplicated_ids = id_counts[id_counts > 1]
duplicated_track_ids = duplicated_ids.index
# now find full rows of duplicated IDs
condition = df["track_id"].isin(duplicated_track_ids)
duplicated_rows = df[condition]
duplicated_rows = duplicated_rows.sort_values(by = "track_id")
duplicated_rows.head(60)
Out[11]:
| track_id | track_name | track_artist | track_popularity | track_album_id | track_album_name | track_album_release_date | playlist_genre | playlist_subgenre | danceability | ... | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | 0 | popularity_category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 32084 | 00Gu3RMpDW2vO9PjlMVFDL | Hide Away (feat. Envy Monroe) | Blasterjaxx | 42 | 5pqG85igfoeWcCDIsSi9x7 | Hide Away (feat. Envy Monroe) | 2019-06-21 | edm | progressive electro house | 0.573 | ... | 1 | 0.0421 | 0.02490 | 0.000000 | 0.3610 | 0.134 | 130.001 | 188000 | 0.0 | 0 |
| 28696 | 00Gu3RMpDW2vO9PjlMVFDL | Hide Away (feat. Envy Monroe) | Blasterjaxx | 42 | 5pqG85igfoeWcCDIsSi9x7 | Hide Away (feat. Envy Monroe) | 2019-06-21 | edm | big room | 0.573 | ... | 1 | 0.0421 | 0.02490 | 0.000000 | 0.3610 | 0.134 | 130.001 | 188000 | 0.0 | 0 |
| 23850 | 00QyLmjxaSEE8qIZQjBXBj | We Own It (Fast & Furious) | 2 Chainz | 59 | 1jg2UPoSAr7CDPsEXcabo1 | Fast & Furious 6 | 2013-01-01 | r&b | hip pop | 0.554 | ... | 1 | 0.4080 | 0.05210 | 0.000000 | 0.0568 | 0.552 | 171.966 | 227893 | 0.0 | 1 |
| 28968 | 00QyLmjxaSEE8qIZQjBXBj | We Own It (Fast & Furious) | 2 Chainz | 59 | 1jg2UPoSAr7CDPsEXcabo1 | Fast & Furious 6 | 2013-01-01 | edm | big room | 0.554 | ... | 1 | 0.4080 | 0.05210 | 0.000000 | 0.0568 | 0.552 | 171.966 | 227893 | 0.0 | 1 |
| 9387 | 00QyLmjxaSEE8qIZQjBXBj | We Own It (Fast & Furious) | 2 Chainz | 59 | 1jg2UPoSAr7CDPsEXcabo1 | Fast & Furious 6 | 2013-01-01 | rap | gangster rap | 0.554 | ... | 1 | 0.4080 | 0.05210 | 0.000000 | 0.0568 | 0.552 | 171.966 | 227893 | 0.0 | 1 |
| 7853 | 00ReeHCY0FQUyuAUyPJdnk | Ain't No Future In Yo' Frontin' | MC Breed | 48 | 7mLks5uEIPmT0056mb5oV3 | MC Breed & DFC | 1991-01-01 | rap | southern hip hop | 0.672 | ... | 1 | 0.2480 | 0.05140 | 0.000000 | 0.4740 | 0.731 | 102.192 | 244733 | 0.0 | 1 |
| 9345 | 00ReeHCY0FQUyuAUyPJdnk | Ain't No Future In Yo' Frontin' | MC Breed | 48 | 7mLks5uEIPmT0056mb5oV3 | MC Breed & DFC | 1991-01-01 | rap | gangster rap | 0.672 | ... | 1 | 0.2480 | 0.05140 | 0.000000 | 0.4740 | 0.731 | 102.192 | 244733 | 0.0 | 1 |
| 3124 | 00WIXhVVhswHuS6dlkScuw | Hot | Confetti | 51 | 6nsXqX8wZbkiqSKmSqxsuT | Hot | 2019-11-22 | pop | electropop | 0.607 | ... | 1 | 0.1070 | 0.00297 | 0.000002 | 0.1200 | 0.664 | 168.015 | 150714 | 0.0 | 1 |
| 14626 | 00WIXhVVhswHuS6dlkScuw | Hot | Confetti | 51 | 6nsXqX8wZbkiqSKmSqxsuT | Hot | 2019-11-22 | rock | permanent wave | 0.607 | ... | 1 | 0.1070 | 0.00297 | 0.000002 | 0.1200 | 0.664 | 168.015 | 150714 | 0.0 | 1 |
| 18487 | 00i0O74dXdaKKdCrqHnfXm | La Mordidita | Ricky Martin | 69 | 375cUd86z58eqXN2yW3Do9 | A Quien Quiera Escuchar (Deluxe Edition) | 2015-02-10 | latin | latin pop | 0.725 | ... | 1 | 0.0658 | 0.03440 | 0.000000 | 0.1830 | 0.946 | 142.006 | 211680 | 0.0 | 1 |
| 21198 | 00i0O74dXdaKKdCrqHnfXm | La Mordidita | Ricky Martin | 69 | 375cUd86z58eqXN2yW3Do9 | A Quien Quiera Escuchar (Deluxe Edition) | 2015-02-10 | latin | latin hip hop | 0.725 | ... | 1 | 0.0658 | 0.03440 | 0.000000 | 0.1830 | 0.946 | 142.006 | 211680 | 0.0 | 1 |
| 1903 | 00qOE7OjRl0BpYiCiweZB2 | Juke Box Hero | Foreigner | 67 | 2Pw51hAGvWpTA3AYl2WVuu | 4 (Expanded) | 1981 | pop | post-teen pop | 0.357 | ... | 1 | 0.0654 | 0.08280 | 0.000000 | 0.0844 | 0.522 | 176.647 | 259800 | 0.0 | 1 |
| 12529 | 00qOE7OjRl0BpYiCiweZB2 | Juke Box Hero | Foreigner | 67 | 2Pw51hAGvWpTA3AYl2WVuu | 4 (Expanded) | 1981 | rock | classic rock | 0.357 | ... | 1 | 0.0654 | 0.08280 | 0.000000 | 0.0844 | 0.522 | 176.647 | 259800 | 0.0 | 1 |
| 15244 | 01R0Xdwje645C6xFCnMRvm | Talk Dirty To Me | Poison | 54 | 0xOBnypzEh4WKROJ51LL09 | Look What The Cat Dragged In | 1986-01-01 | rock | hard rock | 0.507 | ... | 1 | 0.0440 | 0.02110 | 0.002480 | 0.3810 | 0.708 | 157.996 | 223960 | 0.0 | 1 |
| 12979 | 01R0Xdwje645C6xFCnMRvm | Talk Dirty To Me | Poison | 54 | 0xOBnypzEh4WKROJ51LL09 | Look What The Cat Dragged In | 1986-01-01 | rock | classic rock | 0.507 | ... | 1 | 0.0440 | 0.02110 | 0.002480 | 0.3810 | 0.708 | 157.996 | 223960 | 0.0 | 1 |
| 121 | 01iyINEYgPQ7ThMZuHUsqS | First Love | Lost Kings | 58 | 7syMmofF2t1xI0RFCtrSG9 | First Love | 2017-10-13 | pop | dance pop | 0.619 | ... | 1 | 0.3500 | 0.02570 | 0.000014 | 0.1280 | 0.601 | 94.380 | 207428 | 0.0 | 1 |
| 17491 | 01iyINEYgPQ7ThMZuHUsqS | First Love | Lost Kings | 58 | 7syMmofF2t1xI0RFCtrSG9 | First Love | 2017-10-13 | latin | tropical | 0.619 | ... | 1 | 0.3500 | 0.02570 | 0.000014 | 0.1280 | 0.601 | 94.380 | 207428 | 0.0 | 1 |
| 12828 | 02138lFv3Bzncr6ScNbLAF | Rattle Your Cage | Skrizzly Adams | 44 | 3RcttHMKlZ7K1ovxIANPd0 | Young Man | 2019-11-15 | rock | classic rock | 0.583 | ... | 1 | 0.1230 | 0.02700 | 0.000000 | 0.0908 | 0.498 | 179.874 | 209972 | 0.0 | 0 |
| 15159 | 02138lFv3Bzncr6ScNbLAF | Rattle Your Cage | Skrizzly Adams | 44 | 3RcttHMKlZ7K1ovxIANPd0 | Young Man | 2019-11-15 | rock | hard rock | 0.583 | ... | 1 | 0.1230 | 0.02700 | 0.000000 | 0.0908 | 0.498 | 179.874 | 209972 | 0.0 | 0 |
| 4953 | 0240T0gP9w6xEgIciBrfVF | Talk Is Cheap | Nick Murphy / Chet Faker | 61 | 2ytxqdwQ0Hn9JeQmcIWHuh | Built on Glass | 2014-04-14 | pop | indie poptimism | 0.656 | ... | 0 | 0.1710 | 0.41600 | 0.000172 | 0.2450 | 0.520 | 140.058 | 218067 | 0.0 | 1 |
| 26173 | 0240T0gP9w6xEgIciBrfVF | Talk Is Cheap | Nick Murphy / Chet Faker | 61 | 2ytxqdwQ0Hn9JeQmcIWHuh | Built on Glass | 2014-04-14 | r&b | neo soul | 0.656 | ... | 0 | 0.1710 | 0.41600 | 0.000172 | 0.2450 | 0.520 | 140.058 | 218067 | 0.0 | 1 |
| 4169 | 02CygBCQOIyEuhNZqHHcNx | It Runs Through Me | Tom Misch | 67 | 28enuddLPEA914scE6Drvk | Geography | 2018-04-06 | pop | indie poptimism | 0.802 | ... | 0 | 0.2890 | 0.20500 | 0.000748 | 0.2460 | 0.274 | 96.916 | 261881 | 0.0 | 1 |
| 25425 | 02CygBCQOIyEuhNZqHHcNx | It Runs Through Me | Tom Misch | 67 | 28enuddLPEA914scE6Drvk | Geography | 2018-04-06 | r&b | neo soul | 0.802 | ... | 0 | 0.2890 | 0.20500 | 0.000748 | 0.2460 | 0.274 | 96.916 | 261881 | 0.0 | 1 |
| 2703 | 02M6vucOvmRfMxTXDUwRXu | 7/11 | Beyoncé | 71 | 2UJwKSBUz6rtW4QLK74kQu | BEYONCÉ [Platinum Edition] | 2014-11-24 | pop | electropop | 0.747 | ... | 0 | 0.1260 | 0.01280 | 0.000000 | 0.1260 | 0.560 | 136.024 | 213507 | 0.0 | 1 |
| 22268 | 02M6vucOvmRfMxTXDUwRXu | 7/11 | Beyoncé | 71 | 2UJwKSBUz6rtW4QLK74kQu | BEYONCÉ [Platinum Edition] | 2014-11-24 | r&b | urban contemporary | 0.747 | ... | 0 | 0.1260 | 0.01280 | 0.000000 | 0.1260 | 0.560 | 136.024 | 213507 | 0.0 | 1 |
| 7325 | 02M6vucOvmRfMxTXDUwRXu | 7/11 | Beyoncé | 71 | 2UJwKSBUz6rtW4QLK74kQu | BEYONCÉ [Platinum Edition] | 2014-11-24 | rap | southern hip hop | 0.747 | ... | 0 | 0.1260 | 0.01280 | 0.000000 | 0.1260 | 0.560 | 136.024 | 213507 | 0.0 | 1 |
| 23061 | 02M6vucOvmRfMxTXDUwRXu | 7/11 | Beyoncé | 71 | 2UJwKSBUz6rtW4QLK74kQu | BEYONCÉ [Platinum Edition] | 2014-11-24 | r&b | hip pop | 0.747 | ... | 0 | 0.1260 | 0.01280 | 0.000000 | 0.1260 | 0.560 | 136.024 | 213507 | 0.0 | 1 |
| 214 | 02itaCXOdC54J0ISjqqFAp | All Around The World (La La La) | R3HAB | 80 | 0Y59j5oCvwTM2aNyPb6YpJ | All Around The World (La La La) | 2019-04-05 | pop | dance pop | 0.733 | ... | 0 | 0.0330 | 0.47900 | 0.064500 | 0.1050 | 0.520 | 124.948 | 147840 | 0.0 | 1 |
| 29639 | 02itaCXOdC54J0ISjqqFAp | All Around The World (La La La) | R3HAB | 80 | 0Y59j5oCvwTM2aNyPb6YpJ | All Around The World (La La La) | 2019-04-05 | edm | pop edm | 0.733 | ... | 0 | 0.0330 | 0.47900 | 0.064500 | 0.1050 | 0.520 | 124.948 | 147840 | 0.0 | 1 |
| 18896 | 02lGHA7bFFplYLihnUWTx8 | Tequila | Juanes | 77 | 2X45SzRfAFsxgkBxgryWfF | Más Futuro Que Pasado | 2019-11-22 | latin | reggaeton | 0.757 | ... | 1 | 0.1120 | 0.22600 | 0.000000 | 0.2370 | 0.704 | 155.994 | 159547 | 0.0 | 1 |
| 17611 | 02lGHA7bFFplYLihnUWTx8 | Tequila | Juanes | 77 | 2X45SzRfAFsxgkBxgryWfF | Más Futuro Que Pasado | 2019-11-22 | latin | latin pop | 0.757 | ... | 1 | 0.1120 | 0.22600 | 0.000000 | 0.2370 | 0.704 | 155.994 | 159547 | 0.0 | 1 |
| 27762 | 02q7qbOYbE89NMFEtOklcc | Unity | Dimitri Vegas & Like Mike | 66 | 5mnKVK3cksHH5Lzm1OZpiN | Unity | 2018-07-20 | edm | electro house | 0.650 | ... | 0 | 0.0406 | 0.02410 | 0.800000 | 0.2930 | 0.191 | 129.999 | 234462 | 0.0 | 1 |
| 28409 | 02q7qbOYbE89NMFEtOklcc | Unity | Dimitri Vegas & Like Mike | 66 | 5mnKVK3cksHH5Lzm1OZpiN | Unity | 2018-07-20 | edm | big room | 0.650 | ... | 0 | 0.0406 | 0.02410 | 0.800000 | 0.2930 | 0.191 | 129.999 | 234462 | 0.0 | 1 |
| 31530 | 02q7qbOYbE89NMFEtOklcc | Unity | Dimitri Vegas & Like Mike | 66 | 5mnKVK3cksHH5Lzm1OZpiN | Unity | 2018-07-20 | edm | progressive electro house | 0.650 | ... | 0 | 0.0406 | 0.02410 | 0.800000 | 0.2930 | 0.191 | 129.999 | 234462 | 0.0 | 1 |
| 31042 | 037yW9RzsLze4OmBYmcH4G | Save My Night - BlasterJaxx Remix | Armin van Buuren | 16 | 16AD0yLkUKcp2FBtXKOE9j | Save My Night (BlasterJaxx Remix) | 2014-03-10 | edm | progressive electro house | 0.686 | ... | 1 | 0.0475 | 0.00196 | 0.745000 | 0.0520 | 0.165 | 129.972 | 326827 | 0.0 | 0 |
| 27979 | 037yW9RzsLze4OmBYmcH4G | Save My Night - BlasterJaxx Remix | Armin van Buuren | 16 | 16AD0yLkUKcp2FBtXKOE9j | Save My Night (BlasterJaxx Remix) | 2014-03-10 | edm | electro house | 0.686 | ... | 1 | 0.0475 | 0.00196 | 0.745000 | 0.0520 | 0.165 | 129.972 | 326827 | 0.0 | 0 |
| 26909 | 03m9WRVBzoxyTeKblvLvpR | You Got The Love | Keanu Silva | 62 | 14B4NCJRvKpfyQBAGBCJT4 | You Got The Love | 2019-10-04 | edm | electro house | 0.744 | ... | 1 | 0.0558 | 0.03940 | 0.000306 | 0.2950 | 0.700 | 124.046 | 164446 | 0.0 | 1 |
| 29965 | 03m9WRVBzoxyTeKblvLvpR | You Got The Love | Keanu Silva | 62 | 14B4NCJRvKpfyQBAGBCJT4 | You Got The Love | 2019-10-04 | edm | pop edm | 0.744 | ... | 1 | 0.0558 | 0.03940 | 0.000306 | 0.2950 | 0.700 | 124.046 | 164446 | 0.0 | 1 |
| 6012 | 03tqyYWC9Um2ZqU0ZN849H | No Hands (feat. Roscoe Dash & Wale) | Waka Flocka Flame | 74 | 6MQtWELG7aRX7CkAzQ6nLM | Flockaveli | 2010-10-01 | rap | hip hop | 0.760 | ... | 1 | 0.0391 | 0.00544 | 0.000000 | 0.2410 | 0.361 | 131.497 | 263773 | 0.0 | 1 |
| 7523 | 03tqyYWC9Um2ZqU0ZN849H | No Hands (feat. Roscoe Dash & Wale) | Waka Flocka Flame | 74 | 6MQtWELG7aRX7CkAzQ6nLM | Flockaveli | 2010-10-01 | rap | southern hip hop | 0.760 | ... | 1 | 0.0391 | 0.00544 | 0.000000 | 0.2410 | 0.361 | 131.497 | 263773 | 0.0 | 1 |
| 23363 | 04KTF78FFg8sOHC1BADqbY | Hot In Herre | Nelly | 71 | 4HUUHHXBXImwksfbSPqE7q | Nellyville | 2002-06-25 | r&b | hip pop | 0.956 | ... | 0 | 0.1200 | 0.20600 | 0.000000 | 0.0615 | 0.912 | 107.075 | 228240 | 0.0 | 1 |
| 7236 | 04KTF78FFg8sOHC1BADqbY | Hot In Herre | Nelly | 71 | 4HUUHHXBXImwksfbSPqE7q | Nellyville | 2002-06-25 | rap | southern hip hop | 0.956 | ... | 0 | 0.1200 | 0.20600 | 0.000000 | 0.0615 | 0.912 | 107.075 | 228240 | 0.0 | 1 |
| 6315 | 04MLEeAMuV9IlHEsD8vF6A | No Stylist | French Montana | 76 | 0DMvfJWc1DjSbmnJF1NW1o | No Stylist | 2018-09-20 | rap | hip hop | 0.765 | ... | 0 | 0.1270 | 0.02150 | 0.000000 | 0.2270 | 0.498 | 147.055 | 192172 | 0.0 | 1 |
| 9424 | 04MLEeAMuV9IlHEsD8vF6A | No Stylist | French Montana | 76 | 0DMvfJWc1DjSbmnJF1NW1o | No Stylist | 2018-09-20 | rap | gangster rap | 0.765 | ... | 0 | 0.1270 | 0.02150 | 0.000000 | 0.2270 | 0.498 | 147.055 | 192172 | 0.0 | 1 |
| 11228 | 04MLEeAMuV9IlHEsD8vF6A | No Stylist | French Montana | 76 | 0DMvfJWc1DjSbmnJF1NW1o | No Stylist | 2018-09-20 | rap | trap | 0.765 | ... | 0 | 0.1270 | 0.02150 | 0.000000 | 0.2270 | 0.498 | 147.055 | 192172 | 0.0 | 1 |
| 4683 | 04ZTP5KsCypmtCmQg5tH9R | I'm a Mess | Bebe Rexha | 80 | 4TOkZvtqNpg5UHyGxCn0mS | Expectations | 2018-06-22 | pop | indie poptimism | 0.630 | ... | 0 | 0.0253 | 0.00281 | 0.000000 | 0.0719 | 0.216 | 97.005 | 195519 | 0.0 | 1 |
| 23280 | 04ZTP5KsCypmtCmQg5tH9R | I'm a Mess | Bebe Rexha | 80 | 4TOkZvtqNpg5UHyGxCn0mS | Expectations | 2018-06-22 | r&b | hip pop | 0.630 | ... | 0 | 0.0253 | 0.00281 | 0.000000 | 0.0719 | 0.216 | 97.005 | 195519 | 0.0 | 1 |
| 1480 | 04ZTP5KsCypmtCmQg5tH9R | I'm a Mess | Bebe Rexha | 80 | 4TOkZvtqNpg5UHyGxCn0mS | Expectations | 2018-06-22 | pop | post-teen pop | 0.630 | ... | 0 | 0.0253 | 0.00281 | 0.000000 | 0.0719 | 0.216 | 97.005 | 195519 | 0.0 | 1 |
| 20659 | 04aAxqtGp5pv12UXAg4pkq | Centuries | Fall Out Boy | 79 | 022DrG7Wp2PSCwzuD0bSzT | American Beauty/American Psycho | 2015-01-20 | latin | latin hip hop | 0.394 | ... | 0 | 0.0729 | 0.00359 | 0.000000 | 0.1020 | 0.560 | 176.044 | 228360 | 0.0 | 1 |
| 1898 | 04aAxqtGp5pv12UXAg4pkq | Centuries | Fall Out Boy | 79 | 022DrG7Wp2PSCwzuD0bSzT | American Beauty/American Psycho | 2015-01-20 | pop | post-teen pop | 0.394 | ... | 0 | 0.0729 | 0.00359 | 0.000000 | 0.1020 | 0.560 | 176.044 | 228360 | 0.0 | 1 |
| 29979 | 04wllvXvWOkZS5NugzeS8O | Lost | Chris Burke | 37 | 13FrOWDba1sGvu10ofSKFr | Lost | 2019-08-16 | edm | pop edm | 0.616 | ... | 0 | 0.0397 | 0.12100 | 0.022800 | 0.0311 | 0.189 | 128.008 | 272124 | 0.0 | 0 |
| 27251 | 04wllvXvWOkZS5NugzeS8O | Lost | Chris Burke | 37 | 13FrOWDba1sGvu10ofSKFr | Lost | 2019-08-16 | edm | electro house | 0.616 | ... | 0 | 0.0397 | 0.12100 | 0.022800 | 0.0311 | 0.189 | 128.008 | 272124 | 0.0 | 0 |
| 30800 | 05CwHjIk71RXVU40boRMnR | Call You Mine | The Chainsmokers | 39 | 1ONuDpN0a3zhCUyKCgtuzK | World War Joy | 2019-05-31 | edm | pop edm | 0.591 | ... | 1 | 0.0289 | 0.22500 | 0.000000 | 0.4140 | 0.501 | 104.003 | 217653 | 0.0 | 0 |
| 70 | 05CwHjIk71RXVU40boRMnR | Call You Mine | The Chainsmokers | 39 | 1ONuDpN0a3zhCUyKCgtuzK | World War Joy | 2019-05-31 | pop | dance pop | 0.591 | ... | 1 | 0.0289 | 0.22500 | 0.000000 | 0.4140 | 0.501 | 104.003 | 217653 | 0.0 | 0 |
| 4648 | 05CwHjIk71RXVU40boRMnR | Call You Mine | The Chainsmokers | 39 | 1ONuDpN0a3zhCUyKCgtuzK | World War Joy | 2019-05-31 | pop | indie poptimism | 0.591 | ... | 1 | 0.0289 | 0.22500 | 0.000000 | 0.4140 | 0.501 | 104.003 | 217653 | 0.0 | 0 |
| 14910 | 05RgAMGypEvqhNs5hPCbMS | Panama - 2015 Remaster | Van Halen | 73 | 3REUXdj5OPKhuDTrTtCBU0 | 1984 (Remastered) | 1984-01-04 | rock | hard rock | 0.527 | ... | 1 | 0.1090 | 0.00124 | 0.000048 | 0.0744 | 0.463 | 141.169 | 210227 | 0.0 | 1 |
| 13589 | 05RgAMGypEvqhNs5hPCbMS | Panama - 2015 Remaster | Van Halen | 73 | 3REUXdj5OPKhuDTrTtCBU0 | 1984 (Remastered) | 1984-01-04 | rock | classic rock | 0.527 | ... | 1 | 0.1090 | 0.00124 | 0.000048 | 0.0744 | 0.463 | 141.169 | 210227 | 0.0 | 1 |
| 30873 | 05SBRd4fXgn8FX7bf8BCAE | I Need Your Love (feat. Ellie Goulding) | Calvin Harris | 69 | 7w19PFbxAjwZ7UVNp9z0uT | 18 Months | 2012-10-26 | edm | pop edm | 0.695 | ... | 1 | 0.0483 | 0.41000 | 0.000000 | 0.2370 | 0.580 | 124.989 | 234507 | 0.0 | 1 |
| 20737 | 05SBRd4fXgn8FX7bf8BCAE | I Need Your Love (feat. Ellie Goulding) | Calvin Harris | 69 | 7w19PFbxAjwZ7UVNp9z0uT | 18 Months | 2012-10-26 | latin | latin hip hop | 0.695 | ... | 1 | 0.0483 | 0.41000 | 0.000000 | 0.2370 | 0.580 | 124.989 | 234507 | 0.0 | 1 |
| 1584 | 05SBRd4fXgn8FX7bf8BCAE | I Need Your Love (feat. Ellie Goulding) | Calvin Harris | 69 | 7w19PFbxAjwZ7UVNp9z0uT | 18 Months | 2012-10-26 | pop | post-teen pop | 0.695 | ... | 1 | 0.0483 | 0.41000 | 0.000000 | 0.2370 | 0.580 | 124.989 | 234507 | 0.0 | 1 |
60 rows × 23 columns
In [12]:
# now we must drop the duplicates now that we've identified them
df = df.drop_duplicates(subset = "track_id", inplace = False)
# make sure unique rows match
print(f"Unique rows - rows when duplicates dropped = {unique_rows - df.shape[0]}")
Unique rows - rows when duplicates dropped = 0
In [13]:
df.nunique()
Out[13]:
track_id 28356 track_name 23449 track_artist 10692 track_popularity 101 track_album_id 22545 track_album_name 19743 track_album_release_date 4530 playlist_genre 6 playlist_subgenre 24 danceability 822 energy 952 key 12 loudness 10222 mode 2 speechiness 1270 acousticness 3731 instrumentalness 4729 liveness 1624 valence 1362 tempo 17684 duration_ms 19785 0 1 popularity_category 2 dtype: int64
In [14]:
df["mode"].unique()
Out[14]:
array([1, 0])
In [15]:
# figuring out how to parse album/song release dates
print(type(df["track_album_release_date"]))
print(df["track_album_release_date"][21][:4])
print(df["track_album_release_date"][21][5:7])
print(df["track_album_release_date"][21][8:10])
<class 'pandas.core.series.Series'> 2019 08 23
In [16]:
df["date_len"] = df["track_album_release_date"].astype(str).str.len()
len = df["date_len"]
year_only = len[len != 10]
year_only[year_only != 4]
Out[16]:
3446 7 3524 7 7614 7 11740 7 11877 7 12208 7 12283 7 12512 7 12538 7 12764 7 12789 7 13135 7 13227 7 13365 7 13395 7 13408 7 13480 7 13576 7 14504 7 15266 7 15408 7 15990 7 22473 7 22744 7 22759 7 Name: date_len, dtype: int64
In [71]:
# Since we have varying levels of date data we only keep year of release
df["Release Year"] = ""
for i in range(unique_rows):
df.at[df.index[i], "Release Year"] = df["track_album_release_date"].iloc[i][:4]
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) File /opt/anaconda3/lib/python3.12/site-packages/pandas/core/indexes/base.py:3805, in Index.get_loc(self, key) 3804 try: -> 3805 return self._engine.get_loc(casted_key) 3806 except KeyError as err: File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc() File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc() File pandas/_libs/hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.PyObjectHashTable.get_item() File pandas/_libs/hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'track_album_release_date' The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) Cell In[71], line 4 2 df["Release Year"] = "" 3 for i in range(unique_rows): ----> 4 df.at[df.index[i], "Release Year"] = df["track_album_release_date"].iloc[i][:4] File /opt/anaconda3/lib/python3.12/site-packages/pandas/core/frame.py:4102, in DataFrame.__getitem__(self, key) 4100 if self.columns.nlevels > 1: 4101 return self._getitem_multilevel(key) -> 4102 indexer = self.columns.get_loc(key) 4103 if is_integer(indexer): 4104 indexer = [indexer] File /opt/anaconda3/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812, in Index.get_loc(self, key) 3807 if isinstance(casted_key, slice) or ( 3808 isinstance(casted_key, abc.Iterable) 3809 and any(isinstance(x, slice) for x in casted_key) 3810 ): 3811 raise InvalidIndexError(key) -> 3812 raise KeyError(key) from err 3813 except TypeError: 3814 # If we have a listlike key, _check_indexing_error will raise 3815 # InvalidIndexError. Otherwise we fall through and re-raise 3816 # the TypeError. 3817 self._check_indexing_error(key) KeyError: 'track_album_release_date'
In [18]:
# Don't need the full date or date_len columns anymore
del df["track_album_release_date"]
del df["date_len"]
In [19]:
df.columns
Out[19]:
Index([ 'track_id', 'track_name', 'track_artist',
'track_popularity', 'track_album_id', 'track_album_name',
'playlist_genre', 'playlist_subgenre', 'danceability',
'energy', 'key', 'loudness',
'mode', 'speechiness', 'acousticness',
'instrumentalness', 'liveness', 'valence',
'tempo', 'duration_ms', 0,
'popularity_category', 'Release Year'],
dtype='object')
In [20]:
df = df.rename(columns = {'track_id': 'ID',
'track_name' : 'Title',
'track_artist': 'artist',
'track_album_id': 'Album ID',
'track_album_name': 'Album',
'playlist_genre': 'genre',
'playlist_subgenre': 'subgenre',
'danceability': 'danceability',
'energy': 'energy',
'key': 'key',
'loudness': 'loudness',
'mode': 'mode',
'speechiness': 'speechiness',
'acousticness': 'acousticness',
'instrumentalness': 'instrumentalness',
'liveness': 'liveness',
'valence': 'valence',
'tempo': 'tempo',
'duration_ms': 'Song Length',
'track_popularity': 'Popularity',
'popularity_category': 'popularity_category',
'Release Year': 'Release Year'
})
In [21]:
df = df[['ID',
'Title',
'artist',
'Album ID',
'Album',
'genre',
'subgenre',
'danceability',
'energy',
'key',
'loudness',
'mode',
'speechiness',
'acousticness',
'instrumentalness',
'liveness',
'valence',
'tempo',
'Song Length',
'Popularity',
'popularity_category',
'Release Year']]
In [22]:
df.describe()
Out[22]:
| danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | Song Length | Popularity | popularity_category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 28356.000000 | 28356.000000 | 28356.000000 | 28356.000000 | 28356.000000 | 28356.000000 | 28356.000000 | 28356.000000 | 28356.000000 | 28356.000000 | 28356.00000 | 28356.000000 | 28356.000000 | 28356.000000 |
| mean | 0.653372 | 0.698388 | 5.368000 | -6.817696 | 0.565489 | 0.107954 | 0.177176 | 0.091117 | 0.190958 | 0.510387 | 120.95618 | 226575.967026 | 39.329771 | 0.449922 |
| std | 0.145785 | 0.183503 | 3.613904 | 3.036243 | 0.495701 | 0.102556 | 0.222803 | 0.232548 | 0.155894 | 0.234340 | 26.95456 | 61078.450819 | 23.702376 | 0.497495 |
| min | 0.000000 | 0.000175 | 0.000000 | -46.448000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 4000.000000 | 0.000000 | 0.000000 |
| 25% | 0.561000 | 0.579000 | 2.000000 | -8.309250 | 0.000000 | 0.041000 | 0.014375 | 0.000000 | 0.092600 | 0.329000 | 99.97200 | 187742.000000 | 21.000000 | 0.000000 |
| 50% | 0.670000 | 0.722000 | 6.000000 | -6.261000 | 1.000000 | 0.062600 | 0.079700 | 0.000021 | 0.127000 | 0.512000 | 121.99300 | 216933.000000 | 42.000000 | 0.000000 |
| 75% | 0.760000 | 0.843000 | 9.000000 | -4.709000 | 1.000000 | 0.133000 | 0.260000 | 0.006570 | 0.249000 | 0.695000 | 133.99900 | 254975.250000 | 58.000000 | 1.000000 |
| max | 0.983000 | 1.000000 | 11.000000 | 1.275000 | 1.000000 | 0.918000 | 0.994000 | 0.994000 | 0.996000 | 0.991000 | 239.44000 | 517810.000000 | 100.000000 | 1.000000 |
In [23]:
df_num = df[['danceability',
'energy',
'key',
'loudness',
'mode',
'speechiness',
'acousticness',
'instrumentalness',
'liveness',
'valence',
'tempo',
'Song Length',
'Popularity']]
In [24]:
# Check for collinearity of numeric data
correlations = df_num.corr()
plt.figure(figsize = (12,10))
sns.heatmap(correlations, annot = True, cmap = "BuPu")
plt.show()
In [25]:
sns.pairplot(df_num)
Out[25]:
<seaborn.axisgrid.PairGrid at 0x122ab89e0>
In [26]:
# Remove all NaN data
df.dropna(inplace = True)
In [27]:
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 28352 entries, 0 to 32832 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 28352 non-null object 1 Title 28352 non-null object 2 artist 28352 non-null object 3 Album ID 28352 non-null object 4 Album 28352 non-null object 5 genre 28352 non-null object 6 subgenre 28352 non-null object 7 danceability 28352 non-null float64 8 energy 28352 non-null float64 9 key 28352 non-null int64 10 loudness 28352 non-null float64 11 mode 28352 non-null int64 12 speechiness 28352 non-null float64 13 acousticness 28352 non-null float64 14 instrumentalness 28352 non-null float64 15 liveness 28352 non-null float64 16 valence 28352 non-null float64 17 tempo 28352 non-null float64 18 Song Length 28352 non-null int64 19 Popularity 28352 non-null int64 20 popularity_category 28352 non-null int64 21 Release Year 28352 non-null object dtypes: float64(9), int64(5), object(8) memory usage: 5.0+ MB
In [28]:
# Data is now ready for analysis
In [29]:
artist_counts = df["artist"].value_counts()
famous = artist_counts[artist_counts > 50]
famous.index
Out[29]:
Index(['Queen', 'Martin Garrix', 'Don Omar', 'David Guetta',
'Dimitri Vegas & Like Mike', 'Drake', 'Hardwell', 'The Chainsmokers',
'Logic', 'Guns N' Roses', '2Pac', 'The Weeknd', 'Wisin & Yandel'],
dtype='object', name='artist')
In [30]:
def artist_plot(artist, df):
artist_name = artist
filtered_data = df[df['artist'] == artist_name]
# Scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(filtered_data['Release Year'], filtered_data['Popularity'], color='blue')
plt.title(f'{artist_name} - Popularity Score of Songs')
plt.xlabel('Release Year')
plt.ylabel('Popularity Score')
plt.xticks(rotation=90) # Rotate song names for readability
plt.tight_layout() # Adjust layout to prevent clipping of labels
plt.show()
In [31]:
artist_plot("Queen", df)
In [32]:
artist_plot("Drake", df)
In [33]:
def test_artist_affect(df):
# Group the data by artist and extract popularity scores for each artist
grouped_data = [group['Popularity'].values for _, group in df.groupby('artist')]
# Perform ANOVA test
f_statistic, p_value = stats.f_oneway(*grouped_data)
# Output the results
print(f'F-statistic: {f_statistic}')
print(f'P-value: {p_value}')
# Interpret the results
if p_value < 0.05:
print('There is significant evidence that the artist affects the popularity score.')
else:
print('There is no significant evidence that the artist affects the popularity score.')
test_artist_affect(df)
F-statistic: 1.9805636759344931 P-value: 0.0 There is significant evidence that the artist affects the popularity score.
In [34]:
artist_plot("The Weeknd", df)
In [35]:
plt.figure(figsize=(12, 8))
# Create a violin plot to show the distribution of 'Popularity' for each 'Release Year' in sorted order
sns.violinplot(x='Release Year', y='Popularity', data=df, order=sorted(df['Release Year'].unique()))
plt.title('Distribution of Popularity Scores by Release Year')
plt.xlabel('Release Year')
plt.ylabel('Popularity')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
In [36]:
# Create a DF called y with all rows and just the target column
# Reshape your into a 1-dimensional NumPy array object
df_log_reg = df[[ "danceability",
"energy",
"key",
"loudness",
"mode",
"speechiness",
"acousticness",
"instrumentalness",
"liveness",
"valence",
"tempo",
"Song Length",
"popularity_category"]]
y = df_log_reg.iloc[:, -1:].values.reshape(-1, )
print("y:", y.shape)
print(type(y))
print()
# create a DF called X with all rows and all columns before target
X = df_log_reg.iloc[:, :-1]
print("X shape: ", X.shape)
print(type(X))
y: (28352,) <class 'numpy.ndarray'> X shape: (28352, 12) <class 'pandas.core.frame.DataFrame'>
In [37]:
# Imports
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# create an instance of sk;earns LogisticRegression
model = LogisticRegression(max_iter = 200, multi_class = 'multinomial', solver = 'lbfgs')
# remove intercept column from independent variables in X
X = X.iloc[:, :-1]
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns = X.columns)
print(X_scaled)
print()
danceability energy key loudness mode speechiness \
0 0.649057 1.185948 0.175052 1.377883 0.876494 -0.484071
1 0.498153 0.635555 1.558684 0.608875 0.876494 -0.688858
2 0.148330 1.267690 -1.208579 1.115070 -1.140909 -0.329017
3 0.443279 1.262240 0.451779 1.001119 0.876494 -0.057918
4 -0.023151 0.733645 -1.208579 0.706689 0.876494 -0.702510
... ... ... ... ... ... ...
28347 -1.545909 1.218645 -0.931853 1.647942 0.876494 -0.139833
28348 -0.901138 0.477521 -1.485305 0.775851 0.876494 -0.643025
28349 -0.853123 0.668251 0.175052 0.631929 -1.140909 -0.583539
28350 -0.187774 1.033364 -0.931853 1.138453 0.876494 0.010344
28351 -0.345537 1.011566 -0.101674 0.739953 -1.140909 -0.677156
acousticness instrumentalness liveness valence tempo
0 -0.337471 -0.391856 -0.806073 0.032493 0.039986
1 -0.470319 -0.373753 1.065178 0.779272 -0.778593
2 -0.438902 -0.391756 -0.519323 0.437888 0.113148
3 -0.666450 -0.391816 0.083685 -0.995928 0.037018
4 -0.434863 -0.391856 -0.690604 0.915826 0.111960
... ... ... ... ... ...
28347 -0.451469 -0.391856 -0.796451 -1.281838 0.267559
28348 -0.787584 -0.373495 1.180648 -0.471049 0.262773
28349 -0.310542 -0.391852 -0.262724 -0.317426 0.260843
28350 -0.759713 0.154243 0.975368 -0.863642 0.261548
28351 -0.794662 1.074443 3.534947 -1.796475 0.260658
[28352 rows x 11 columns]
In [38]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.3, random_state = 10, shuffle = True)# create cross-tabulation and look at the percentage of each category for the target variable
count_train = pd.crosstab(index = y_train, columns = "Percent")
print("Training Target Variable:")
print()
print(count_train / count_train.sum())
print()
count_test = pd.crosstab(index = y_test, columns = "Percent")
print("Testing Target Variable:")
print()
print(count_test / count_test.sum())
print()
Training Target Variable: col_0 Percent row_0 0 0.547969 1 0.452031 Testing Target Variable: col_0 Percent row_0 0 0.554785 1 0.445215
In [39]:
# Fit the model to the training data.
model.fit(X_train, y_train)
# Output the y-intercept and regression coefficients.
print("Intercept :" + str(model.intercept_))
print("Coefficients: " + str(model.coef_))
print()
# Create a new DataFrame called df_results.
# Add the predictor names and coefficient values to df_results.
# Take the exponents of the coefficients to create the odds ratios.
# Add the odds ratio values to df_results.
my_dictionary = {
"predictor" : X_train.columns,
"coefficient": model.coef_[0]
}
df_results = pd.DataFrame(my_dictionary)
df_results["odds_ratios"] = np.exp(df_results["coefficient"])
# Sort the results by odds ratio descending.
# Output the results.
df_results = df_results.sort_values("odds_ratios", ascending = False)
df_results
Intercept :[-0.10243594] Coefficients: [[ 0.02152963 -0.17891547 0.00335629 0.1622179 0.01531719 -0.0129159 0.04643728 -0.08183558 -0.01856358 0.02952272 0.03665443]]
Out[39]:
| predictor | coefficient | odds_ratios | |
|---|---|---|---|
| 3 | loudness | 0.162218 | 1.176116 |
| 6 | acousticness | 0.046437 | 1.047532 |
| 10 | tempo | 0.036654 | 1.037334 |
| 9 | valence | 0.029523 | 1.029963 |
| 0 | danceability | 0.021530 | 1.021763 |
| 4 | mode | 0.015317 | 1.015435 |
| 2 | key | 0.003356 | 1.003362 |
| 5 | speechiness | -0.012916 | 0.987167 |
| 8 | liveness | -0.018564 | 0.981608 |
| 7 | instrumentalness | -0.081836 | 0.921423 |
| 1 | energy | -0.178915 | 0.836177 |
In [40]:
y_predictions = model.predict(X_test)
# Compare y_test to y_predictions by creating a new DataFrame called df_compare with y_test in one (1) column and y_predictions in another column.
data = {
"actual": y_test,
"predicted": y_predictions
}
df_compare = pd.DataFrame(data)
df_compare.head(10)
Out[40]:
| actual | predicted | |
|---|---|---|
| 0 | 1 | 1 |
| 1 | 0 | 0 |
| 2 | 0 | 1 |
| 3 | 1 | 1 |
| 4 | 1 | 0 |
| 5 | 0 | 0 |
| 6 | 1 | 0 |
| 7 | 0 | 1 |
| 8 | 1 | 0 |
| 9 | 1 | 0 |
In [41]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predictions))
precision recall f1-score support
0 0.60 0.75 0.67 4719
1 0.54 0.36 0.43 3787
accuracy 0.58 8506
macro avg 0.57 0.56 0.55 8506
weighted avg 0.57 0.58 0.56 8506
In [42]:
df['loud_minus_energy'] = df['loudness'] - df['energy']
df['acoustic_vs_instr'] = df['acousticness'] - df['instrumentalness']
df['loud_acoustic_product'] = df['loudness'] * df['acousticness']
df['inv_energy'] = 1 / (df['energy'] + 1e-5)
df['valence_tempo'] = df['valence'] * df['tempo']
df_new = df[['loud_minus_energy','acoustic_vs_instr','loud_acoustic_product','inv_energy','valence_tempo','key','danceability','speechiness','mode','Song Length','popularity_category']]
In [43]:
model2 = LogisticRegression(max_iter = 200, multi_class = 'multinomial', solver = 'lbfgs')
In [44]:
y = df_new.iloc[:, -1:].values.reshape(-1, )
print("y:", y.shape)
print(type(y))
print()
# create a DF called X with all rows and all columns before target
X = df_new.iloc[:, :-1]
print("X shape: ", X.shape)
print(type(X))
y: (28352,) <class 'numpy.ndarray'> X shape: (28352, 10) <class 'pandas.core.frame.DataFrame'>
In [45]:
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns = X.columns)
print(X_scaled)
print()
loud_minus_energy acoustic_vs_instr loud_acoustic_product \
0 1.360932 0.049407 0.484794
1 0.594364 -0.055407 0.447710
2 1.081962 -0.020727 0.483233
3 0.963579 -0.177859 0.550030
4 0.690099 -0.017865 0.441438
... ... ... ...
28347 1.640245 -0.029335 0.537604
28348 0.778286 -0.274738 0.591067
28349 0.616325 0.068005 0.378770
28350 1.121079 -0.635961 0.583337
28351 0.707256 -1.323521 0.593926
inv_energy valence_tempo key danceability speechiness \
0 -0.022582 0.051058 0.175052 0.649057 -0.484071
1 -0.018370 0.240196 1.558684 0.498153 -0.688858
2 -0.023130 0.450236 -1.208579 0.148330 -0.329017
3 -0.023094 -0.866664 0.451779 0.443279 -0.057918
4 -0.019196 0.882571 -1.208579 -0.023151 -0.702510
... ... ... ... ... ...
28347 -0.022803 -1.080751 -0.931853 -1.545909 -0.139833
28348 -0.016961 -0.323050 -1.485305 -0.901138 -0.643025
28349 -0.018650 -0.180032 0.175052 -0.853123 -0.583539
28350 -0.021510 -0.690663 -0.931853 -0.187774 0.010344
28351 -0.021352 -1.563232 -0.101674 -0.345537 -0.677156
mode Song Length
0 0.876494 -0.520964
1 0.876494 -1.047386
2 -1.140909 -0.817917
3 0.876494 -0.941083
4 0.876494 -0.614317
... ... ...
28347 0.876494 -0.363450
28348 0.876494 2.071787
28349 -1.140909 -0.269524
28350 0.876494 2.306102
28351 -1.140909 1.816058
[28352 rows x 10 columns]
In [46]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.3, random_state = 10, shuffle = True)# create cross-tabulation and look at the percentage of each category for the target variable
count_train = pd.crosstab(index = y_train, columns = "Percent")
print("Training Target Variable:")
print()
print(count_train / count_train.sum())
print()
count_test = pd.crosstab(index = y_test, columns = "Percent")
print("Testing Target Variable:")
print()
print(count_test / count_test.sum())
print()
Training Target Variable: col_0 Percent row_0 0 0.547969 1 0.452031 Testing Target Variable: col_0 Percent row_0 0 0.554785 1 0.445215
In [47]:
model2.fit(X_train, y_train)
# Output the y-intercept and regression coefficients.
print("Intercept :" + str(model2.intercept_))
print("Coefficients: " + str(model2.coef_))
print()
# Create a new DataFrame called df_results.
# Add the predictor names and coefficient values to df_results.
# Take the exponents of the coefficients to create the odds ratios.
# Add the odds ratio values to df_results.
my_dictionary = {
"predictor" : X_train.columns,
"coefficient": model2.coef_[0]
}
df_results = pd.DataFrame(my_dictionary)
df_results["odds_ratios"] = np.exp(df_results["coefficient"])
# Sort the results by odds ratio descending.
# Output the results.
df_results = df_results.sort_values("odds_ratios", ascending = False)
df_results
Intercept :[-0.0883006] Coefficients: [[ 6.62847167e-02 1.39628222e-01 3.83794418e-02 2.34756361e+00 2.16869635e-02 2.24259121e-03 2.94896491e-02 -1.99390399e-02 1.42820690e-02 -9.62607172e-02]]
Out[47]:
| predictor | coefficient | odds_ratios | |
|---|---|---|---|
| 3 | inv_energy | 2.347564 | 10.460054 |
| 1 | acoustic_vs_instr | 0.139628 | 1.149846 |
| 0 | loud_minus_energy | 0.066285 | 1.068531 |
| 2 | loud_acoustic_product | 0.038379 | 1.039125 |
| 6 | danceability | 0.029490 | 1.029929 |
| 4 | valence_tempo | 0.021687 | 1.021924 |
| 8 | mode | 0.014282 | 1.014385 |
| 5 | key | 0.002243 | 1.002245 |
| 7 | speechiness | -0.019939 | 0.980258 |
| 9 | Song Length | -0.096261 | 0.908227 |
In [48]:
y_predictions = model2.predict(X_test)
# Compare y_test to y_predictions by creating a new DataFrame called df_compare with y_test in one (1) column and y_predictions in another column.
data = {
"actual": y_test,
"predicted": y_predictions
}
df_compare = pd.DataFrame(data)
df_compare.head(10)
Out[48]:
| actual | predicted | |
|---|---|---|
| 0 | 1 | 1 |
| 1 | 0 | 0 |
| 2 | 0 | 1 |
| 3 | 1 | 1 |
| 4 | 1 | 0 |
| 5 | 0 | 0 |
| 6 | 1 | 0 |
| 7 | 0 | 0 |
| 8 | 1 | 0 |
| 9 | 1 | 0 |
In [49]:
print(classification_report(y_test, y_predictions))
precision recall f1-score support
0 0.59 0.77 0.67 4719
1 0.54 0.34 0.41 3787
accuracy 0.58 8506
macro avg 0.57 0.55 0.54 8506
weighted avg 0.57 0.58 0.56 8506
In [50]:
# Let's consider top 3 positive features and top 3 negative features
# Positive: inv_energy, loudness, acoustic_vs_instr
# Negative: energy, song length, liveness
df_final = df[['inv_energy', 'loudness', 'acoustic_vs_instr', 'energy', 'Song Length', 'liveness','popularity_category']]
In [51]:
model3 = LogisticRegression(max_iter = 200, multi_class = 'multinomial', solver = 'lbfgs')
In [52]:
y = df_final.iloc[:, -1:].values.reshape(-1, )
print("y:", y.shape)
print(type(y))
print()
# create a DF called X with all rows and all columns before target
X = df_final.iloc[:, :-1]
print("X shape: ", X.shape)
print(type(X))
y: (28352,) <class 'numpy.ndarray'> X shape: (28352, 6) <class 'pandas.core.frame.DataFrame'>
In [53]:
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns = X.columns)
print(X_scaled)
print()
inv_energy loudness acoustic_vs_instr energy Song Length \
0 -0.022582 1.377883 0.049407 1.185948 -0.520964
1 -0.018370 0.608875 -0.055407 0.635555 -1.047386
2 -0.023130 1.115070 -0.020727 1.267690 -0.817917
3 -0.023094 1.001119 -0.177859 1.262240 -0.941083
4 -0.019196 0.706689 -0.017865 0.733645 -0.614317
... ... ... ... ... ...
28347 -0.022803 1.647942 -0.029335 1.218645 -0.363450
28348 -0.016961 0.775851 -0.274738 0.477521 2.071787
28349 -0.018650 0.631929 0.068005 0.668251 -0.269524
28350 -0.021510 1.138453 -0.635961 1.033364 2.306102
28351 -0.021352 0.739953 -1.323521 1.011566 1.816058
liveness
0 -0.806073
1 1.065178
2 -0.519323
3 0.083685
4 -0.690604
... ...
28347 -0.796451
28348 1.180648
28349 -0.262724
28350 0.975368
28351 3.534947
[28352 rows x 6 columns]
In [54]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.3, random_state = 10, shuffle = True)
In [55]:
model3.fit(X_train, y_train)
# Output the y-intercept and regression coefficients.
print("Intercept :" + str(model3.intercept_))
print("Coefficients: " + str(model3.coef_))
print()
# Create a new DataFrame called df_results.
# Add the predictor names and coefficient values to df_results.
# Take the exponents of the coefficients to create the odds ratios.
# Add the odds ratio values to df_results.
my_dictionary = {
"predictor" : X_train.columns,
"coefficient": model3.coef_[0]
}
df_results = pd.DataFrame(my_dictionary)
df_results["odds_ratios"] = np.exp(df_results["coefficient"])
# Sort the results by odds ratio descending.
# Output the results.
df_results = df_results.sort_values("odds_ratios", ascending = False)
df_results
Intercept :[-0.10304979] Coefficients: [[ 0.00419707 0.14531476 0.09011132 -0.14829699 -0.09264571 -0.0226359 ]]
Out[55]:
| predictor | coefficient | odds_ratios | |
|---|---|---|---|
| 1 | loudness | 0.145315 | 1.156404 |
| 2 | acoustic_vs_instr | 0.090111 | 1.094296 |
| 0 | inv_energy | 0.004197 | 1.004206 |
| 5 | liveness | -0.022636 | 0.977618 |
| 4 | Song Length | -0.092646 | 0.911516 |
| 3 | energy | -0.148297 | 0.862175 |
In [56]:
y_predictions = model3.predict(X_test)
# Compare y_test to y_predictions by creating a new DataFrame called df_compare with y_test in one (1) column and y_predictions in another column.
data = {
"actual": y_test,
"predicted": y_predictions
}
df_compare = pd.DataFrame(data)
df_compare.head(10)
Out[56]:
| actual | predicted | |
|---|---|---|
| 0 | 1 | 1 |
| 1 | 0 | 1 |
| 2 | 0 | 1 |
| 3 | 1 | 0 |
| 4 | 1 | 0 |
| 5 | 0 | 1 |
| 6 | 1 | 0 |
| 7 | 0 | 0 |
| 8 | 1 | 0 |
| 9 | 1 | 0 |
In [57]:
print(classification_report(y_test, y_predictions))
precision recall f1-score support
0 0.60 0.75 0.67 4719
1 0.55 0.37 0.44 3787
accuracy 0.58 8506
macro avg 0.57 0.56 0.55 8506
weighted avg 0.58 0.58 0.57 8506
In [58]:
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 28352 entries, 0 to 32832 Data columns (total 27 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 28352 non-null object 1 Title 28352 non-null object 2 artist 28352 non-null object 3 Album ID 28352 non-null object 4 Album 28352 non-null object 5 genre 28352 non-null object 6 subgenre 28352 non-null object 7 danceability 28352 non-null float64 8 energy 28352 non-null float64 9 key 28352 non-null int64 10 loudness 28352 non-null float64 11 mode 28352 non-null int64 12 speechiness 28352 non-null float64 13 acousticness 28352 non-null float64 14 instrumentalness 28352 non-null float64 15 liveness 28352 non-null float64 16 valence 28352 non-null float64 17 tempo 28352 non-null float64 18 Song Length 28352 non-null int64 19 Popularity 28352 non-null int64 20 popularity_category 28352 non-null int64 21 Release Year 28352 non-null object 22 loud_minus_energy 28352 non-null float64 23 acoustic_vs_instr 28352 non-null float64 24 loud_acoustic_product 28352 non-null float64 25 inv_energy 28352 non-null float64 26 valence_tempo 28352 non-null float64 dtypes: float64(14), int64(5), object(8) memory usage: 6.1+ MB
In [59]:
from sklearn.tree import DecisionTreeClassifier
tree_features = df[df.columns[2:]]
tree_features = tree_features[['genre', 'subgenre','danceability', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'Song Length', 'Release Year', 'loud_minus_energy', 'acoustic_vs_instr', 'inv_energy', 'valence_tempo', 'popularity_category']]
columns_to_keep = ['danceability', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'Song Length', 'Release Year', 'loud_minus_energy', 'acoustic_vs_instr', 'inv_energy', 'valence_tempo', 'popularity_category']
scaler = StandardScaler()
numerical_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
'instrumentalness', 'liveness', 'valence', 'tempo', 'Song Length']
df[numerical_features] = scaler.fit_transform(df[numerical_features])
genre_dummies = pd.get_dummies(tree_features["genre"], prefix = "genre")
subgenre_dummies = pd.get_dummies(tree_features["subgenre"], prefix = "subgenre")
tree_features = tree_features[columns_to_keep]
In [60]:
# Temporarily remove the target column
popularity_col = tree_features["popularity_category"]
tree_features = tree_features.drop(columns=["popularity_category"])
# Join dummy variables
tree_features = tree_features.join(genre_dummies.loc[:, "genre_2":])
tree_features = tree_features.join(subgenre_dummies.loc[:, "subgenre_2":])
In [61]:
# Add the target column back as the last column
tree_features["popularity_category"] = popularity_col
X = tree_features.iloc[:,:-1]
y = tree_features.iloc[:,-1:].values.reshape(-1,)
In [62]:
X.info()
<class 'pandas.core.frame.DataFrame'> Index: 28352 entries, 0 to 32832 Data columns (total 46 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 danceability 28352 non-null float64 1 key 28352 non-null int64 2 loudness 28352 non-null float64 3 mode 28352 non-null int64 4 speechiness 28352 non-null float64 5 acousticness 28352 non-null float64 6 instrumentalness 28352 non-null float64 7 liveness 28352 non-null float64 8 valence 28352 non-null float64 9 tempo 28352 non-null float64 10 Song Length 28352 non-null int64 11 Release Year 28352 non-null object 12 loud_minus_energy 28352 non-null float64 13 acoustic_vs_instr 28352 non-null float64 14 inv_energy 28352 non-null float64 15 valence_tempo 28352 non-null float64 16 genre_edm 28352 non-null bool 17 genre_latin 28352 non-null bool 18 genre_pop 28352 non-null bool 19 genre_r&b 28352 non-null bool 20 genre_rap 28352 non-null bool 21 genre_rock 28352 non-null bool 22 subgenre_album rock 28352 non-null bool 23 subgenre_big room 28352 non-null bool 24 subgenre_classic rock 28352 non-null bool 25 subgenre_dance pop 28352 non-null bool 26 subgenre_electro house 28352 non-null bool 27 subgenre_electropop 28352 non-null bool 28 subgenre_gangster rap 28352 non-null bool 29 subgenre_hard rock 28352 non-null bool 30 subgenre_hip hop 28352 non-null bool 31 subgenre_hip pop 28352 non-null bool 32 subgenre_indie poptimism 28352 non-null bool 33 subgenre_latin hip hop 28352 non-null bool 34 subgenre_latin pop 28352 non-null bool 35 subgenre_neo soul 28352 non-null bool 36 subgenre_new jack swing 28352 non-null bool 37 subgenre_permanent wave 28352 non-null bool 38 subgenre_pop edm 28352 non-null bool 39 subgenre_post-teen pop 28352 non-null bool 40 subgenre_progressive electro house 28352 non-null bool 41 subgenre_reggaeton 28352 non-null bool 42 subgenre_southern hip hop 28352 non-null bool 43 subgenre_trap 28352 non-null bool 44 subgenre_tropical 28352 non-null bool 45 subgenre_urban contemporary 28352 non-null bool dtypes: bool(30), float64(12), int64(3), object(1) memory usage: 4.5+ MB
In [63]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42, shuffle = True)
count_train = pd.crosstab(index = y_train, columns = "Percent")
print("Training Target Variable:")
print()
print(count_train / count_train.sum())
print()
count_test = pd.crosstab(index = y_test, columns = "Percent")
print("Testing Target Variable:")
print()
print(count_test / count_test.sum())
print()
Training Target Variable: col_0 Percent row_0 0 0.551698 1 0.448302 Testing Target Variable: col_0 Percent row_0 0 0.546085 1 0.453915
In [64]:
clf = DecisionTreeClassifier(criterion = "entropy", max_depth = 5)
In [65]:
clf = clf.fit(X_train, y_train)
In [66]:
y_pred = clf.predict(X_test)
In [67]:
data = {
"actual": y_test,
"predicted": y_pred
}
df_comp = pd.DataFrame(data)
df_comp.head(30)
Out[67]:
| actual | predicted | |
|---|---|---|
| 0 | 1 | 0 |
| 1 | 0 | 0 |
| 2 | 1 | 0 |
| 3 | 0 | 0 |
| 4 | 0 | 0 |
| 5 | 1 | 1 |
| 6 | 0 | 0 |
| 7 | 0 | 1 |
| 8 | 0 | 0 |
| 9 | 1 | 1 |
| 10 | 0 | 0 |
| 11 | 1 | 0 |
| 12 | 0 | 0 |
| 13 | 1 | 1 |
| 14 | 0 | 0 |
| 15 | 0 | 1 |
| 16 | 1 | 0 |
| 17 | 1 | 0 |
| 18 | 0 | 0 |
| 19 | 0 | 0 |
| 20 | 1 | 0 |
| 21 | 0 | 0 |
| 22 | 1 | 0 |
| 23 | 1 | 0 |
| 24 | 1 | 0 |
| 25 | 0 | 1 |
| 26 | 1 | 1 |
| 27 | 1 | 1 |
| 28 | 0 | 0 |
| 29 | 1 | 0 |
In [68]:
print("Accuracy: ",accuracy_score(y_test,y_pred))
Accuracy: 0.6177992005643076
In [69]:
print("Precision: ",precision_score(y_test, y_pred, pos_label = 1))
Precision: 0.6683222958057395
In [70]:
print("Recall: ", recall_score(y_test, y_pred, pos_label = 1))
Recall: 0.31364931364931364
In [ ]:
In [ ]: